5.0 10X Genomics PBMC 3K Dataset

In [1]:
from clustergrammer_widget import *
net = Network(clustergrammer_widget)
df = {}

import clustergrammer_groupby as cby
import gene_exp_10x
In [2]:
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np
from copy import deepcopy

import matplotlib.pyplot as plt
%matplotlib inline 

Load Data

In [3]:
df['ge-ini'] = gene_exp_10x.load_gene_exp_to_df('../data/pbmc3k_filtered_gene_bc_matrices/hg19/')
df['ge-ini'].shape
Out[3]:
(32738, 2700)
In [4]:
all_genes = df['ge-ini'].index.tolist()
print(len(all_genes))
keep_genes = [x for x in all_genes if 'RPL' not in x]
keep_genes = [x for x in keep_genes if 'RPS' not in x]
print(len(keep_genes))

df['ge'] = df['ge-ini'].loc[keep_genes]
df['ge'].shape

# Removing Mitochondrial Genes
list_mito_genes = ['MTRNR2L11', 'MTRF1', 'MTRNR2L12', 'MTRNR2L13', 'MTRF1L', 'MTRNR2L6', 'MTRNR2L7',
                'MTRNR2L10', 'MTRNR2L8', 'MTRNR2L5', 'MTRNR2L1', 'MTRNR2L3', 'MTRNR2L4']

all_genes = df['ge'].index.tolist()
mito_genes = [x for x in all_genes if 'MT-' == x[:3] or 
             x.split('_')[0] in list_mito_genes]
print(mito_genes)

keep_genes = [x for x in all_genes if x not in mito_genes]
df['ge'] = df['ge'].ix[keep_genes]

# normalize by UMI count
barcode_umi_sum = df['ge'].sum()
df['ge'] = df['ge'].div(barcode_umi_sum)
32738
32546
['MTRNR2L11', 'MTRNR2L12', 'MTRNR2L13', 'MTRF1L', 'MTRNR2L6', 'MTRNR2L10', 'MTRNR2L7', 'MTRNR2L5', 'MTRNR2L8', 'MTRF1', 'MTRNR2L4', 'MTRNR2L1', 'MTRNR2L3', 'MT-ND1', 'MT-ND2', 'MT-CO1', 'MT-CO2', 'MT-ATP8', 'MT-ATP6', 'MT-CO3', 'MT-ND3', 'MT-ND4L', 'MT-ND4', 'MT-ND5', 'MT-ND6', 'MT-CYB']
In [5]:
net.load_df(df['ge'])
net.normalize(axis='row', norm_type='zscore')
net.swap_nan_for_zero()
df['ge-z'] = net.export_df()
df['ge-z'].shape
Out[5]:
(32520, 2700)

Visualize Original Dataset

In [6]:
net.load_df(df['ge'])
net.filter_N_top(inst_rc='row', N_top=250, rank_type='var')
net.normalize(axis='row', norm_type='zscore')
net.random_sample(axis='col', num_samples=250, random_state=99)
net.clip(lower=-5, upper=5)
net.cluster()
net.widget()
Widget Javascript not detected.  It may not be installed or enabled properly.

Load NM'3337 gene sigantures

In [7]:
net.load_file('../data/cell_type_signatures/nm3337_broad_cell_type_sigs.txt')
df['bct-sig'] = net.export_df()
print(df['bct-sig'].shape)

net.load_file('../data/cell_type_signatures/nm3337_narrow_cell_type_sigs.txt')
df['nct-sig'] = net.export_df()
print(df['nct-sig'].shape)
(523, 9)
(523, 22)
In [8]:
net.load_df(df['nct-sig'])
net.cluster()
net.widget()
Widget Javascript not detected.  It may not be installed or enabled properly.
In [9]:
sig_rows = df['bct-sig'].index.tolist()
clean_sig_rows = [x.split('_')[0] for x in sig_rows]
print(len(clean_sig_rows), len(list(set(clean_sig_rows))))
523 523
In [10]:
ge_rows = df['ge'].index.tolist()
clean_ge_rows = [x.split('_')[0] for x in ge_rows]
print(len(ge_rows), len(list(set(clean_ge_rows))))
32520 32425
In [11]:
ser_ge_rows = pd.Series(clean_ge_rows)
In [12]:
gene_name_count = ser_ge_rows.value_counts(ascending=False)
duplicate_genes = gene_name_count[gene_name_count > 1].index.tolist()
len(duplicate_genes)
Out[12]:
91

only add unique index to duplicate genes

In [13]:
dup_index = {}
new_rows = []
for inst_row in clean_ge_rows:
    
    # add index to non-unique genes
    if inst_row in duplicate_genes:
        
        # calc non-unique index
        if inst_row not in dup_index:
            dup_index[inst_row] = 1
        else:
            dup_index[inst_row] = dup_index[inst_row]  + 1
            
        new_row = inst_row + '_' + str(dup_index[inst_row])
        
    else:
        new_row = inst_row
        
    new_rows.append(new_row)
In [14]:
print(len(new_rows))
print(len(list(set(new_rows))))
32520
32520
In [15]:
# df['ge'].index = new_rows
# df['ge-z'].index = new_rows

Predict Cell Types using NM3337 Signatures

In [16]:
rows = df['nct-sig'].index.tolist()
new_rows = [x.split('_')[0] for x in rows]
df['nct-sig'].index = new_rows
In [17]:
df['nct-sig'].columns.tolist()
Out[17]:
[('B cells naive', 'B cells naive'),
 ('B cells memory', 'B cells memory'),
 ('Plasma cells', 'Plasma cells'),
 ('T cells CD8', 'T cells CD8'),
 ('T cells CD4 naive', 'T cells CD4 naive'),
 ('T cells CD4 memory resting', 'T cells CD4 memory resting'),
 ('T cells CD4 memory activated', 'T cells CD4 memory activated'),
 ('T cells follicular helper', 'T cells follicular helper'),
 ('T cells regulatory (Tregs)', 'T cells regulatory (Tregs)'),
 ('T cells gamma delta', 'T cells gamma delta'),
 ('NK cells resting', 'NK cells resting'),
 ('NK cells activated', 'NK cells activated'),
 ('Monocytes', 'Monocytes'),
 ('Macrophages M0', 'Macrophages M0'),
 ('Macrophages M1', 'Macrophages M1'),
 ('Macrophages M2', 'Macrophages M2'),
 ('Dendritic cells resting', 'Dendritic cells resting'),
 ('Dendritic cells activated', 'Dendritic cells activated'),
 ('Mast cells resting', 'Mast cells resting'),
 ('Mast cells activated', 'Mast cells activated'),
 ('Eosinophils', 'Eosinophils'),
 ('Neutrophils', 'Neutrophils')]
In [18]:
rows = df['bct-sig'].index.tolist()
new_rows = [x.split('_')[0] for x in rows]
df['bct-sig'].index = new_rows
In [19]:
# rows = df['ge-z'].index.tolist()
# new_rows = [x.split('_')[0] for x in rows]
# df['ge-z'].index = new_rows
In [20]:
df['pred_cat'], df['sig_sim'], y_info = cby.predict_cats_from_sigs(df['ge-z'], df['bct-sig'], 
                                                                   predict_level='Cell Type', unknown_thresh=0.05)
In [21]:
net.load_df(df['pred_cat'])
net.set_cat_color(axis='col', cat_index=1, cat_name='Cell Type: T cells CD8', inst_color='red')
net.random_sample(axis='col', num_samples=250, random_state=99)
net.clip(lower=-5, upper=5)
net.cluster()
net.widget()
Widget Javascript not detected.  It may not be installed or enabled properly.
In [22]:
df['ge-cat'] = deepcopy(df['ge'])
df['ge-cat'].shape
Out[22]:
(32520, 2700)
In [23]:
# transfer predicted categories to full dataset and add UMI count
cat_cols = df['pred_cat'].columns.tolist()
df['ge-cat'].columns = cat_cols

# new_cols = [(x[0], x[1], 'UMI: ' + str(barcode_umi_sum[x[0]])) for x in cat_cols]

df['ge-cat-umi'] = deepcopy(df['ge-cat'])
# df['ge-cat-umi'].columns = new_cols
# print(df['ge-cat-umi'].shape)
In [24]:
net.load_df(df['ge-cat-umi'])
net.set_cat_color(axis='col', cat_index=1, cat_name='Cell Type: T cells CD8', inst_color='red')
net.filter_N_top(inst_rc='row', N_top=250, rank_type='var')
net.random_sample(axis='col', num_samples=250, random_state=99)
net.normalize(axis='row', norm_type='zscore')
df['tmp'] = net.export_df()
net.clip(lower=-5, upper=5)
net.cluster()
net.widget()
Widget Javascript not detected.  It may not be installed or enabled properly.
In [25]:
df['tmp'].shape
Out[25]:
(250, 250)

Make more general comparison

In [26]:
sim_dict, pval_dict = cby.sim_same_and_diff_category_samples(df['tmp'])
In [27]:
sim_dict['same'].mean()
Out[27]:
0.1033134243768756
In [28]:
sim_dict['diff'].mean()
Out[28]:
-0.024238409009971908
In [29]:
pval_dict
Out[29]:
{'mannwhitney': 0.0, 'ttest': 0.0}

Signature Prediction

In [30]:
%%time
net.load_df(df['ge-cat-umi'])
net.filter_N_top(inst_rc='row', N_top=1000, rank_type='var')
net.normalize(axis='row', norm_type='zscore')
df['ge-cat-umi-z'] = net.export_df() 

df['cat-sig'], keep_genes, keep_genes_dict = cby.generate_signatures(
                                                          df['ge-cat-umi-z'], 
                                                          'Cell Type', pval_cutoff=0.05)

df['pred_cat'], df['sig_sim'], y_info = cby.predict_cats_from_sigs(df['ge-cat-umi-z'], 
                                                                   df['cat-sig'])

df['conf'], populations, ser_correct, fraction_correct = cby.confusion_matrix_and_correct_series(y_info)
print('fraction correct: ', fraction_correct)
print(f1_score(y_info['true'], y_info['pred'], average='macro'))
print(f1_score(y_info['true'], y_info['pred'], average='micro'))
print(f1_score(y_info['true'], y_info['pred'], average='weighted'))
fraction correct:  0.734074074074
0.619396343163
0.734074074074
0.718106748157
CPU times: user 1.34 s, sys: 1.03 s, total: 2.38 s
Wall time: 2.41 s

Shuffle

In [31]:
%%time
num_shuffles = 100
perform_ser = cby.compare_performance_to_shuffled_labels(df['ge-cat-umi-z'], 'Cell Type', 
                                                         num_shuffles=num_shuffles)
print('mean: ', perform_ser.mean(), 'std: ', perform_ser.std())
print('previously calc real performance: ', fraction_correct)
performance (fraction correct) of unshuffled: 0.734074074074
mean:  0.3608814814814815 std:  0.023286747359150414
previously calc real performance:  0.734074074074
CPU times: user 40.6 s, sys: 9.22 s, total: 49.8 s
Wall time: 49.9 s
In [32]:
perform_ser[perform_ser > fraction_correct]
Out[32]:
Series([], dtype: float64)
In [33]:
perform_ser.hist()
Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0x121b529e8>
In [34]:
(fraction_correct - perform_ser.mean())/perform_ser.std()
Out[34]:
16.025964761710199